### Replication file for
# Title: From Foe to Friend? Government-Opposition Conflict and the Appointment of Cabinet Ministers
# Authors: Herzog, Alexander (alexander.herzog@uni-bamberg.de ; University of Bamberg); Schmuck, David (david.schmuck@uni-bamberg.de ; University of Bamberg , corresponding author)
# Journal: Political Science Research and Methods

# APPENDIX B

# Pre-Settings ----
# Set working directory
# Note: The empty path below is a placeholder. Replace it with the folder that
# holds the input .RData file before running; the relative file names used for
# loading and saving in this script resolve against this directory.
setwd("")

# Clear environment (all.names = TRUE also removes objects whose names start
# with a dot); the argument is spelled out instead of relying on partial
# matching of `all`
rm(list = ls(all.names = TRUE))

# Load packages (install them beforehand if they are not yet available)
library(tidyverse) # for data manipulation
library(quanteda) # for text-as-data applications
library(janitor) # for adding column totals
library(kableExtra) # to save kable-generated tables

# Read the replication data from the working directory
# (the code below expects it to provide 'debates_speeches_df')
load("Replication data - PSRM - Herzog Schmuck - From Foe to Friend - Appendix B.RData")

# Add a factor identifying every debate-speaker combination ("<debate>-<speaker>")
debates_speeches_df <- mutate(
  debates_speeches_df,
  debate_speaker_group = factor(paste(debate_id, speaker_id, sep = "-"))
)

# Build the quanteda corpus; the speech text lives in 'speaker_speech'
corp <- corpus(debates_speeches_df, text_field = "speaker_speech")

# The data frame is not used again once the corpus exists, so release it
rm(debates_speeches_df)
gc()

# Tokenise the corpus, dropping punctuation, symbols and numbers
toks <- tokens(
  corp,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)

# Build the document-feature matrix in one chain:
#   1. create the DFM from the tokens
#   2. collapse documents to one row per debate-speaker combination
#   3. drop German stopwords, plus "dass" which is removed separately
#   4. stem the remaining features
#   5. keep only features with at least two characters
# NOTE(review): dfm_wordstem() is called without a `language` argument, so it
# uses quanteda_options("language_stemmer") (English unless changed elsewhere),
# while the stopword list is German. Confirm this matches the published analysis.
dfmat <- toks |>
  dfm() |>
  dfm_group(groups = debate_speaker_group) |>
  dfm_remove(pattern = stopwords("de")) |>
  dfm_remove(pattern = "dass") |>
  dfm_wordstem() |>
  dfm_keep(min_nchar = 2)

# Drop documents that have no features left after the steps above
dfmat <- dfm_subset(dfmat, rowSums(dfmat) > 0)

# Collect the document variables and attach each document's total feature
# count as 'speech_length' (basis for the descriptive plots, computed before
# any length or speaker-count filtering)
data <- docvars(dfmat) |>
  mutate(speech_length = rowSums(dfmat))

# Figure B.1: Distribution of number of speakers, per debate ----
# Attach the number of distinct speakers of each debate to every row.
# An explicit mutate() is used instead of add_tally(n_distinct(...)), whose
# first extra argument is a weight that gets summed and only yields the
# distinct count incidentally.
data <- data |>
  group_by(debate_id) |>
  mutate(n_speakers_debate = n_distinct(speaker_id)) |>
  ungroup()

# NOTE(review): geom_bar() counts rows of 'data' (one row per debate-speaker
# document), so bar heights are numbers of documents, not numbers of debates.
# Confirm this is the intended reading of the figure.
figB1 <- data |>
  ggplot(aes(x = n_speakers_debate)) +
  geom_bar() +
  scale_x_continuous(expand = c(0, 0), breaks = seq(0, 110, 2)) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 40000),
    breaks = seq(0, 40000, 2500)
  ) +
  xlab("Number of speakers per debate") +
  ylab("Count") +
  ggtitle("Distribution of number of speakers per debate.")

# Plot the graph
plot(figB1)

# Save plot; the plot object is passed explicitly so the saved file does not
# depend on whichever plot happened to be displayed last
ggsave(
  filename = "Herzog_Schmuck_FigB1_Number_of_speakers.tiff",
  plot = figB1,
  device = "tiff",
  dpi = 200,
  width = 11,
  height = 9.89,
  units = "in",
  compression = "lzw"
)

# Figure B.2: Distribution of speech_length ----
# Histogram (bin width 10) of speech length, restricted to speeches with at
# most 1000 words so the bulk of the distribution stays readable
figB2 <- data |>
  filter(speech_length <= 1000) |>
  ggplot(aes(x = speech_length)) +
  geom_histogram(binwidth = 10) +
  scale_x_continuous(expand = c(0, 0), breaks = seq(0, 1000, 50)) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 8000),
    breaks = seq(0, 8000, 500)
  ) +
  xlab("Speech length (number of words)") + # fixed typo: was "lenght"
  ylab("Count") +
  ggtitle("Distribution of speech length after stemming and stopwords removal; speeches with 1000 words or fewer only.")

# Plot the graph
plot(figB2)

# Save plot; the plot object is passed explicitly so the saved file does not
# depend on whichever plot happened to be displayed last
ggsave(
  filename = "Herzog_Schmuck_FigB2_Speech_length.tiff",
  plot = figB2,
  device = "tiff",
  dpi = 200,
  width = 11,
  height = 9.89,
  units = "in",
  compression = "lzw"
)

# Continue data filtering
# Step 1: remove speeches with fewer than k words (counted on the
# preprocessed document-feature matrix)
k <- 150
dfmat <- dfm_subset(dfmat, rowSums(dfmat) >= k)

# Step 2: remove debates with fewer than 5 speakers
# 'data' is re-created because 'dfmat', and hence 'docvars(dfmat)', changed in
# step 1. An explicit mutate() is used instead of add_tally(n_distinct(...)),
# whose first extra argument is a weight that gets summed and only yields the
# distinct count incidentally.
data <- docvars(dfmat) |>
  group_by(debate_id) |>
  mutate(n_speakers_debate = n_distinct(speaker_id)) |>
  ungroup()

# The grouped mutate keeps the row order, so the rows of 'data' still line up
# with the documents of 'dfmat' and can serve as the subset condition
dfmat <- dfm_subset(dfmat, data$n_speakers_debate >= 5)

# Remove features with 0 appearances after removing debates
dfmat <- dfm_trim(dfmat, min_termfreq = 1)

# Table B.1: Descriptive statistics of legislative speech data, 1961 to 2021 ----
# Refresh 'data' so it reflects the filtered document-feature matrix
data <- docvars(dfmat)

# Per cabinet (cabinet_id 4 and above only): number of debates, number of
# distinct speakers, number of speeches (rows), and speeches per speaker
# rounded to one decimal, followed by a totals row.
# NOTE(review): adorn_totals() sums every numeric column, so the Total row for
# N_speakers and avg_N adds up per-cabinet values rather than giving an overall
# distinct count or overall average. Confirm this is the intended presentation.
tabB1 <- data |>
  filter(cabinet_id >= 4) |>
  group_by(cabinet_name) |>
  summarise(
    N_debates = n_distinct(debate_id),
    N_speakers = n_distinct(speaker_id),
    N_speeches = n(),
    avg_N = round(N_speeches / N_speakers, 1)
  ) |>
  adorn_totals()

# Print Table B.1 and write it to an HTML file in the working directory
print(tabB1)
save_kable(
  kable(tabB1, format = "html"),
  file = "Herzog_Schmuck_TabB1_descriptive_statistics_speakers_speeches.html"
)




